library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.7     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.3.1     ✔ yardstick    1.3.2
✔ recipes      1.1.1     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
library(ggplot2)
library(patchwork)
root <- 'https://gdex.ucar.edu/dataset/camels/file'
download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
              'data/camels_attributes_v2.0.pdf')
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")
remote_files <- glue('{root}/camels_{types}.txt')
local_files <- glue('data/camels_{types}.txt')
walk2(remote_files, local_files, download.file, quiet = TRUE)
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels, by = 'gauge_id')

Question 1

Zero_q_freq means frequency of days with Q (dishcharge)= 0 mm per day.

ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") + 
  geom_point(aes(color = q_mean)) + scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

Question 2

map_aridity <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") + 
  geom_point(aes(color = aridity)) + scale_color_gradient(low = "lightblue", high = "darkred") +
  labs(title = "Aridity Data from USGS sites in the US",
       color = "Aridity") +
  ggthemes::theme_map()
print(map_aridity)

map_p_mean <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") + 
  geom_point(aes(color = p_mean)) + scale_color_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Mean Precipitation from USGS sites in the US",
       color = "Mean Precipitation") +
  ggthemes::theme_map()
print(map_p_mean)

final_maps <- map_aridity + map_p_mean + plot_layout(ncol = 2)
print(final_maps)

ggsave("final_maps.png", plot = final_maps, width = 10, height = 6, dpi = 300)
camels %>% 
  select(aridity, p_mean, q_mean) %>% 
  drop_na() %>% 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  scale_color_viridis_c() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runnoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

set.seed(123)
camels <- camels %>% 
  mutate(logQmean = log(q_mean))

camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) %>% 
  step_log(all_predictors()) %>% 
  step_interact(terms = ~ aridity:p_mean) %>% 
  step_naomit(all_predictors(), all_outcomes())
baked_data <- prep(rec, camels_train) %>% 
  bake(new_data = NULL)

lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
test_data <- bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() +
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observes Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

lm_model <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

lm_wf <- workflow() %>% 
  add_recipe(rec) %>% 
  add_model(lm_model) %>% 
  fit(data = camels_train)

summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61
metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

library(baguette)
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(rf_model) %>%
  fit(data = camels_train) 
rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.587
2 rsq     standard       0.741
3 mae     standard       0.363
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

wf <- workflow_set(list(rec), list(lm_model, rf_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.563  0.0247    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.771  0.0259    10 recipe       rand…     1
3 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     2
4 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     2

Question 3

xgb_model <- boost_tree() %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

nn_model <- bag_mlp() %>% 
  set_engine("nnet") %>% 
  set_mode("regression")
wf <- workflow_set(
  list(rec),
  list(
    lm_model = lm_model,
    rf_model = rf_model,
    xgb_model = xgb_model,
    nn_model = nn_model
  )
) %>% 
  workflow_map("fit_resamples", resamples = camels_cv)
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id         .config  .metric  mean std_err     n preprocessor model  rank
  <chr>            <chr>    <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_nn_model  Preproc… rmse    0.547  0.0308    10 recipe       bag_…     1
2 recipe_nn_model  Preproc… rsq     0.787  0.0266    10 recipe       bag_…     1
3 recipe_rf_model  Preproc… rmse    0.565  0.0247    10 recipe       rand…     2
4 recipe_rf_model  Preproc… rsq     0.770  0.0258    10 recipe       rand…     2
5 recipe_lm_model  Preproc… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_lm_model  Preproc… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_xgb_model Preproc… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_xgb_model Preproc… rsq     0.745  0.0268    10 recipe       boos…     4
# The random forest model was ranked the best, so I would move forward with this one. It was ranked as the model with the highest r-squared.

Question 4

set.seed(123)

camels_split2 <- initial_split(camels, prop = 0.75)
camels_train2 <- training(camels_split2)
camels_test2 <- testing(camels_split2)

camels_cv2 <- vfold_cv(camels_train2, v = 10)
rec2 <- recipe(logQmean ~ aridity + p_mean, data = camels_train2) %>% 
  step_log(all_predictors()) %>% 
  step_interact(terms = ~ aridity:p_mean) %>% 
  step_naomit(all_predictors(), all_outcomes())

# I chose to use aridity and mean precipitation to predict logQmean because they would both have an impact on mean daily discharge, especially precipitation.
rf_model2 <- rand_forest() %>% 
  set_engine("ranger") %>% 
  set_mode("regression")

lm_model2 <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

nn_model2 <- bag_mlp() %>% 
  set_engine("nnet") %>% 
  set_mode("regression")
wf2 <- workflow_set(list(rec2), list(rf_model2, lm_model2, nn_model2)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 
autoplot(wf2)

rank_results(wf2, rank_metric = "rsq", select_best = TRUE)
# A tibble: 6 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     1
2 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0245    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.769  0.0257    10 recipe       rand…     2
5 recipe_bag_mlp    Prepro… rmse    0.654  0.112     10 recipe       bag_…     3
6 recipe_bag_mlp    Prepro… rsq     0.745  0.0466    10 recipe       bag_…     3
# I think the nn_model2 is the best because it has the lowest rmse and the highest r-squared. It is also ranked the best below.
nn_wf <- workflow() %>% 
  add_recipe(rec2) %>% 
  add_model(nn_model2) %>% 
  fit(data = camels_train2)

nn_data <- augment(nn_wf, new_data = camels_test2)
dim(nn_data)
[1] 168  61
ggplot(nn_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw() +
  labs(title = "Observed vs. Predicted Values", x = "Log Mean Discharge", y = "Predicted", color = "Aridity")

# I think the results look great as most points align, although there are some that deviate. I think this model overall did a great job of predicting the values.